import re
from collections import Counter

import jieba
import openpyxl
import pandas as pd
  
def is_chinese_regex(word):
    """Return True when *word* starts with at least one Chinese character.

    The check uses ``re.match``, which anchors only at the beginning of the
    string: a token such as "中abc" is accepted, while "abc中" and the empty
    string are rejected.
    """
    leading_chinese = re.match(r'[\u4e00-\u9fa5]+', word)
    if leading_chinese is None:
        return False
    return True
  
# Read only the review-text column from the source workbook.
file_path = 'd:\\chapter_hemaApp_comment.xlsx'  # replace with the path to your Excel file
df = pd.read_excel(file_path, usecols=['content'])  # assumes the 'content' column holds the review text

# Empty cells are read as NaN (a float) and numeric cells as numbers, while
# jieba.cut expects text. Drop the blanks and coerce everything else to str so
# a single empty row does not abort the whole run.
comments = df['content'].dropna().astype(str)

# Segment every comment with jieba (accurate mode) and tally the tokens that
# pass the Chinese-character check. Counter is a dict subclass that keeps
# first-seen order, so the output order matches a plain dict-based tally.
word_freq = Counter()
for text in comments:
    word_freq.update(
        word for word in jieba.cut(text, cut_all=False) if is_chinese_regex(word)
    )

# Print the word-frequency results.
for word, freq in word_freq.items():
    print(f"{word}: {freq}")

# Create a new workbook and write a header row followed by one row per word.
wb = openpyxl.Workbook()
ws = wb.active
ws.append(["词语", "词频"])
for word, freq in word_freq.items():
    ws.append([word, freq])

# Save the workbook; the relative path resolves against the current working directory.
wb.save("word_frequencies.xlsx")